#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(WGCNA)
## ==========================================================================
## *
## *  Package WGCNA 1.66 loaded.
## *
## *    Important note: It appears that your system supports multi-threading,
## *    but it is not enabled within WGCNA in R. 
## *    To allow multi-threading within WGCNA with all available cores, use 
## *
## *          allowWGCNAThreads()
## *
## *    within R. Use disableWGCNAThreads() to disable threading if necessary.
## *    Alternatively, set the following environment variable on your system:
## *
## *          ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## *    for example 
## *
## *          ALLOW_WGCNA_THREADS=4
## *
## *    To set the environment variable in linux bash shell, type 
## *
## *           export ALLOW_WGCNA_THREADS=4
## *
## *     before running R. Other operating systems or shells will
## *     have a similar command to achieve the same aim.
## *
## ==========================================================================
library(expss)
library(polycor)
library(knitr)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

clustering_selected = 'DynamicHybridMergedSmall'

print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybridMergedSmall"

Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_15_WGCNA.Rmd)

# Gandal dataset
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations
GO_annotations = read.csv('./../../../PhD-InitialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% 
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))


# Dataset created with DynamicTreeMerged algorithm
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]

rm(DE_info, GO_annotations, clusterings)
print(paste0('There are ', length(unique(dataset$Module)),' modules:'))
## [1] "There are 36 modules:"
dataset$Module %>% table %>% sort(decreasing=TRUE) %>% print
## .
## #53B400 #00BD64 #C17FFF #F17E4F #E76BF3 #F365E6 #C49A00 #FD6F86 #8195FF 
##    1930    1555    1339    1253    1149    1086     977     942     856 
## #00C0BB #4B9FFF #00C094 #00BB45 #00B70C #00BF7D #D29300 #FF61C5    gray 
##     738     575     566     414     400     327     314     247     178 
## #E88523 #F8766D #00A8FF #FF64B2 #75AF00 #8EAB00 #A58AFF #A3A500 #00BECD 
##     176     160     131     128     125     120     114      99      97 
## #D774FD #00B0F6 #00B6EB #00BBDD #FF699D #B4A000 #DE8C00 #FB61D7 #00C1A8 
##      97      95      94      82      72      45      45      42      32

Module-Trait associations

datTraits = datMeta %>% dplyr::select(Diagnosis_, Region, Sex, Age, PMI, RNAExtractionBatch) %>%
            rename('Diagnosis' = Diagnosis_, 'ExtractionBatch' = RNAExtractionBatch)

# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = dataset$Module)
MEs = orderMEs(ME_object$eigengenes)

# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits, ML=TRUE)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))

# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
  print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated')) 
}
## [1] "1 correlation(s) could not be calculated"
moduleTraitCor = moduleTraitCor[complete.cases(moduleTraitCor),]
moduleTraitPvalue = moduleTraitPvalue[complete.cases(moduleTraitCor),]

# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)


labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels =  gsub('ME','',rownames(moduleTraitCor)), 
               yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
               textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
               main = paste('Module-Trait relationships'))

rm(ME_object, datTraits, MEs, moduleTraitCor, moduleTraitPvalue, textMatrix)

Selecting the three modules with highest (absolute) correlation to Diagnosis: #C17FFF #00C094 and #C49A00

pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
            mutate(ImportantModules = ifelse(Module=='#C17FFF', '#C17FFF',
                                      ifelse(Module=='#00C094', '#00C094',
                                      ifelse(Module=='#C49A00', '#C49A00', 'Others')))) %>%
            mutate(color = ifelse(ImportantModules=='Others','gray',ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.1, 0.5))

ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + 
         geom_point(alpha=plot_data$alpha, color=plot_data$color) + theme_minimal() + 
           ggtitle('Modules with strongest relation to Diagnosis'))

SFARI genes and Module-Diagnosis significance

Modules with the strongest module-diagnosis correlation should have the highest percentage of SFARI Genes, but the plot shows this relation in modules with correlations close to 0, and thi behaviour is the same for the other clusterings I have tried…

plot_data = dataset %>% mutate('hasSFARIscore' = gene.score!='None') %>% 
            group_by(Module, MTcor, hasSFARIscore) %>% summarise(p=n()) %>% 
            left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>% 
            mutate(p=round(p/n*100,2)) 

for(i in 1:nrow(plot_data)){
  this_row = plot_data[i,]
  if(this_row$hasSFARIscore==FALSE & this_row$p==100){
    new_row = this_row
    new_row$hasSFARIscore = TRUE
    new_row$p = 0
    plot_data = plot_data %>% rbind(new_row)
  }
}

plot_data = plot_data %>% filter(hasSFARIscore==TRUE)

ggplotly(plot_data %>% ggplot(aes(MTcor, p, size=n)) + geom_smooth(color='gray', se=FALSE) +
         geom_point(color=plot_data$Module, aes(id=Module)) +
         xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
         theme_minimal() + theme(legend.position = 'none'))
rm(i, this_row, new_row)

Gene Significance

Gene significance close to 0 means the gene is not significant and negative values means the gene is underexpressed in Autism samples. So the important thing about gene significance is it’s absolute value

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID')

ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.5) + 
         scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
  theme_minimal() + ggtitle('Gene Significance'))

Module Membership and Gene Significance relation

Genes from the module #C17FFF

Module with the highest (absolute) correlation to Diagnosis (-0.92)

plot_data = dataset %>% dplyr::select(MM.C17FFF, GS, gene.score) %>% filter(dataset$Module=='#C17FFF')

ggplotly(plot_data %>% ggplot(aes(MM.C17FFF, GS, color=gene.score)) + geom_point(alpha=0.5) + 
         scale_color_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
         theme_minimal())

Genes from the module #C49A00

Module with the second highest (absolute) correlation to Diagnosis (-0.86)

plot_data = dataset %>% dplyr::select(MM.C49A00, GS, gene.score) %>% filter(dataset$Module=='#C49A00')

ggplotly(plot_data %>% ggplot(aes(MM.C49A00, GS, color=gene.score)) + geom_point(alpha=0.5) + 
         scale_color_manual(values=SFARI_colour_hue(r=c(2:6,8,7))) + 
         theme_minimal())

Genes from the module #00C094

Module with the third highest (absolute) correlation to Diagnosis and the top one with positive correlation (0.86)

plot_data = dataset %>% dplyr::select(MM.00C094, GS, gene.score) %>% filter(dataset$Module=='#00C094')


ggplotly(plot_data %>% ggplot(aes(MM.00C094, GS, color=gene.score)) + geom_point(alpha=0.5) + 
         scale_color_manual(values=SFARI_colour_hue(r=c(3:6,8,7))) + 
         theme_minimal())

Identifying important genes

Selecting the modules with the highest correlation to Diagnosis, and, from them, the genes with the highest module membership-(absolute) gene significance

*Ordered by \(\frac{MM+GS}{2}\)

Only one SFARI Gene in the two lists …

top_genes_1 = dataset %>% dplyr::select(ID, MM.C17FFF, GS, gene.score) %>% filter(dataset$Module=='#C17FFF') %>%
              rename('MM' = 'MM.C17FFF') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
              top_n(10)

kable(top_genes_1, caption='Top 10 genes for module #C17FFF')
Top 10 genes for module #C17FFF
ID MM GS gene.score importance
ENSG00000050748 0.8988150 -0.9399810 None 0.9193980
ENSG00000108395 0.9029160 -0.9250494 None 0.9139827
ENSG00000138078 0.8879452 -0.8966280 None 0.8922866
ENSG00000177432 0.8546653 -0.8915541 None 0.8731097
ENSG00000163577 0.8328568 -0.9116043 None 0.8722306
ENSG00000176490 0.8489868 -0.8936753 None 0.8713311
ENSG00000171132 0.8338444 -0.9028094 None 0.8683269
ENSG00000128881 0.8698308 -0.8654230 None 0.8676269
ENSG00000155097 0.8463972 -0.8885175 None 0.8674573
ENSG00000196876 0.9445612 -0.7816707 3 0.8631160
top_genes_2 = dataset %>% dplyr::select(ID, MM.C49A00, GS, gene.score) %>% filter(dataset$Module=='#C49A00') %>%
              rename('MM' = 'MM.C49A00') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
              top_n(10)

kable(top_genes_2, caption='Top 10 genes for module #C49A00')
Top 10 genes for module #C49A00
ID MM GS gene.score importance
ENSG00000147419 0.8141518 -0.9461082 None 0.8801300
ENSG00000134108 0.8057677 -0.8920351 None 0.8489014
ENSG00000134265 0.8401172 -0.8478985 None 0.8440079
ENSG00000165169 0.8519740 -0.8173056 None 0.8346398
ENSG00000150768 0.8124113 -0.8412765 None 0.8268439
ENSG00000134440 0.7982305 -0.8361057 None 0.8171681
ENSG00000169139 0.8424449 -0.7718000 None 0.8071224
ENSG00000138138 0.7972805 -0.7841764 None 0.7907285
ENSG00000156467 0.8373351 -0.7420724 None 0.7897037
ENSG00000075303 0.7775604 -0.7943756 None 0.7859680
top_genes_3 = dataset %>% dplyr::select(ID, MM.00C094, GS, gene.score) %>% filter(dataset$Module=='#00C094') %>%
              rename('MM' = 'MM.00C094') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
              top_n(10)

kable(top_genes_3, caption='Top 10 genes for module #00C094')
Top 10 genes for module #00C094
ID MM GS gene.score importance
ENSG00000026025 0.7987985 0.8872398 None 0.8430191
ENSG00000174348 0.7732747 0.8656890 None 0.8194819
ENSG00000173068 0.8396254 0.7969568 None 0.8182911
ENSG00000197321 0.9062201 0.7232536 None 0.8147369
ENSG00000142173 0.8963275 0.7327475 None 0.8145375
ENSG00000164692 0.9168698 0.7006483 None 0.8087590
ENSG00000108821 0.9018778 0.7113022 None 0.8065900
ENSG00000172296 0.7790732 0.8111975 None 0.7951354
ENSG00000096696 0.9085424 0.6529530 None 0.7807477
ENSG00000115461 0.8306216 0.7187476 None 0.7746846
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
            mutate(color = ifelse(Module=='#C17FFF', '#C17FFF',
                           ifelse(Module=='#00C094', '#00C094',
                           ifelse(Module=='#C49A00', '#C49A00', 'gray')))) %>%
            mutate(alpha = ifelse(color %in% c('#C17FFF', '#00C094', '#C49A00') & 
                                  ID %in% c(as.character(top_genes_1$ID), 
                                            as.character(top_genes_2$ID),
                                            as.character(top_genes_3$ID)), 1, 0.1))

plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) + 
              theme_minimal() + ggtitle('Important genes identified through WGCNA')